/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.openkm.module.jcr.stuff.apache;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.jackrabbit.extractor.CompositeTextExtractor;
import org.apache.jackrabbit.extractor.DelegatingTextExtractor;
import org.apache.jackrabbit.extractor.EmptyTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Backwards-compatible Jackrabbit text extractor component. This class
 * implements the following functionality:
 * <ul>
 *   <li>
 *     Parses the configured {@link TextExtractor} and {@link TextFilter}
 *     class names and instantiates the configured classes.
 *   </li>
 *   <li>
 *     Acts as the delegate extractor for any configured
 *     {@link DelegatingTextExtractor} instances.
 *   </li>
 *   <li>
 *     Maintains a {@link CompositeTextExtractor} instance that contains
 *     all the configured extractors and to which all text extraction calls
 *     are delegated.
 *   </li>
 *   <li>
 *     Creates a {@link TextFilterExtractor} adapter for a configured
 *     {@link TextFilter} instance when it is first used and adds that adapter
 *     to the composite extractor for use in text extraction.
 *   </li>
 *   <li>
 *     Logs a debug message and creates a dummy {@link EmptyTextExtractor}
 *     instance for any unsupported content type when it is first detected.
 *     The dummy extractor is added to the composite extractor to prevent
 *     repeated messages about the same content type.
 *   </li>
 * </ul>
 */
public class JackrabbitTextExtractor implements TextExtractor {
	
	/**
	 * Logger instance.
	 */
	private static final Logger logger = LoggerFactory.getLogger(JackrabbitTextExtractor.class);
	
	/**
	 * Set of content types that are known to be handled by the composite
	 * extractor. Seeded from the configured extractors in the constructor and
	 * extended lazily by {@link #getExtractor(String)}.
	 * <p>
	 * This set is a plain {@link HashSet} and is modified without any
	 * synchronization in this class.
	 */
	private final Set<String> types = new HashSet<String>();
	
	/**
	 * Composite extractor used for all text extraction tasks. Contains
	 * all the {@link TextExtractor} instances for directly supported content
	 * types, the {@link TextFilterExtractor} adapters for backwards
	 * compatibility with configured {@link TextFilter} instances that have
	 * already been used, and the dummy {@link EmptyTextExtractor} instances
	 * created for unsupported content types.
	 */
	private final CompositeTextExtractor extractor = new CompositeTextExtractor();
	
	/**
	 * Configured {@link TextFilter} instances. Used for backwards
	 * compatibility with existing configuration files and {@link TextFilter} implementations.
	 */
	private final Collection<TextFilter> filters = new ArrayList<TextFilter>();
	
	/**
	 * Creates a Jackrabbit text extractor without any configured component
	 * classes. Every content type requested later is registered with an
	 * {@link EmptyTextExtractor} on first use.
	 */
	public JackrabbitTextExtractor() {
	}
	
	/**
	 * Creates a Jackrabbit text extractor containing the configured component
	 * classes.
	 * <p>
	 * Each entry is loaded with {@link Class#forName(String)} and instantiated
	 * through its no-argument constructor. A class that cannot be found,
	 * linked, accessed or instantiated is logged as a warning and skipped, so
	 * one broken entry does not prevent the remaining ones from being used.
	 * 
	 * @param classes fully qualified names of the configured
	 *        {@link TextExtractor} (and {@link TextFilter}) classes; a
	 *        <code>null</code> list is treated as an empty one
	 */
	@SuppressWarnings("deprecation")
	public JackrabbitTextExtractor(List<String> classes) {
		logger.debug("JackrabbitTextExtractor({})", classes);
		
		if (classes != null) {
			for (String clazz : classes) {
				try {
					// Class.newInstance() is kept deliberately: switching to
					// Constructor.newInstance() would change which exceptions
					// escape from a failing component constructor.
					Object object = Class.forName(clazz).newInstance();
					
					// Delegating extractors call back into this instance for
					// nested content.
					if (object instanceof DelegatingTextExtractor) {
						((DelegatingTextExtractor) object).setDelegateTextExtractor(this);
					}
					
					if (object instanceof TextExtractor) {
						extractor.addTextExtractor((TextExtractor) object);
					} else if (object instanceof TextFilter) {
						// Filters are only wrapped in an adapter once a
						// content type they accept is actually requested.
						filters.add((TextFilter) object);
					} else {
						logger.warn("Unknown text extractor class: {}", clazz);
					}
				} catch (ClassNotFoundException e) {
					logger.warn("Extractor class not found: " + clazz, e);
				} catch (LinkageError e) {
					logger.warn("Extractor dependency not found: " + clazz, e);
				} catch (IllegalAccessException e) {
					logger.warn("Extractor constructor not accessible: " + clazz, e);
				} catch (InstantiationException e) {
					logger.warn("Extractor instantiation failed: " + clazz, e);
				}
			}
		}
		
		// Remember everything the directly configured extractors claim to
		// support so that getExtractor() does not consult the filters for it.
		types.addAll(Arrays.asList(extractor.getContentTypes()));
	}
	
	// -------------------------------------------------------< TextExtractor >
	
	/**
	 * Returns the content types reported by the composite extractor.
	 * <p>
	 * Because filter adapters and empty extractors are added to the composite
	 * extractor on demand, the result presumably grows as new content types
	 * are requested (this depends on how {@link CompositeTextExtractor}
	 * aggregates its components).
	 * 
	 * @return supported content types
	 */
	public String[] getContentTypes() {
		return extractor.getContentTypes(); // and then some
	}
	
	/**
	 * Extracts the text content from the given binary stream. The given
	 * content type is used to look up a configured text extractor to which
	 * to delegate the request.
	 * <p>
	 * If a matching extractor is not found, then the configured text filters are searched for an instance that claims
	 * to support the given content type. A text extractor adapter is created for that filter and added to the
	 * composite extractor for future use before delegating the request.
	 * <p>
	 * If not even a text filter is found for the given content type, a debug message is logged and an empty text
	 * extractor is created for that content type and added to the composite extractor for future use before
	 * delegating the request.
	 * 
	 * @param stream binary stream
	 * @param type content type
	 * @param encoding character encoding, or <code>null</code>
	 * @return reader for the text content of the binary stream
	 * @throws IOException if the binary stream can not be read
	 */
	public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
		logger.debug("extractText(stream, {}, {})", type, encoding);
		return getExtractor(type).extractText(stream, type, encoding);
	}
	
	/**
	 * Makes sure the composite extractor has a component registered for the
	 * given content type and returns the composite extractor.
	 * <p>
	 * For a content type that has not been seen before, the first configured
	 * {@link TextFilter} whose <code>canFilter</code> accepts it is wrapped in
	 * a {@link TextFilterExtractor}; if there is none, an
	 * {@link EmptyTextExtractor} is registered instead. Either way the type is
	 * recorded so that the lookup happens only once per content type.
	 * 
	 * @param type content type
	 * @return the shared composite extractor (always the same instance)
	 * @throws IOException declared for interface compatibility; nothing in
	 *         this method throws it directly
	 */
	@SuppressWarnings("deprecation")
	public CompositeTextExtractor getExtractor(String type) throws IOException {
		logger.debug("getExtractor({})", type);
		
		if (types.contains(type)) {
			return extractor;
		}
		
		for (TextFilter filter : filters) {
			if (filter.canFilter(type)) {
				types.add(type);
				extractor.addTextExtractor(new TextFilterExtractor(type, filter));
				return extractor;
			}
		}
		
		// No extractor and no filter: register a no-op extractor so that the
		// message below is emitted only once per content type.
		logger.debug("Full text indexing of {} is not supported", type);
		types.add(type);
		extractor.addTextExtractor(new EmptyTextExtractor(type));
		
		return extractor;
	}
}
